{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Parser comparison\n",
    "\n",
    "This notebook lets you visualize side-by-side how each parser analyzes a document, and compare the resulting tables.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Install camelot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install camelot-py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Bootstrap and common imports\n",
    "import sys, time\n",
    "\n",
    "sys.path.insert(\n",
    "    0, os.path.abspath(\"\")\n",
    ")  # Prefer the local version of camelot if available\n",
    "import camelot\n",
    "\n",
    "print(f\"Using camelot v{camelot.__version__}.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select a PDF file to review\n",
    "\n",
    "You can modify the section below to point to a pdf or your choice to visualize the results.  By default, it points to one of the test .pdfs included with camelot.\n",
    "This is seeded with the unit test files for convenience."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kwargs = {}\n",
    "data = None\n",
    "# pdf_file = \"vertical_header.pdf\"  # test_network_vertical_header\n",
    "# pdf_file, kwargs = \"vertical_header.pdf\", {\"pages\": \"all\"}  # test_network_vertical_headerpages\n",
    "# pdf_file, kwargs = \"background_lines_1.pdf\", {\"process_background\": True} # {\"process_background\": True}  # test_lattice_process_background\n",
    "\n",
    "# pdf_file, kwargs, data = \"superscript.pdf\", {\"flag_size\": True}, data_stream_flag_size  # test_network_flag_size\n",
    "# pdf_file, kwargs = \"superscript.pdf\", {\"flag_size\": True} # , data_stream_flag_size  # test_network_flag_size\n",
    "# pdf_file = \"health.pdf\"  # test_network\n",
    "# pdf_file = \"clockwise_table_2.pdf\"\n",
    "# pdf_file = \"clockwise_table_1.pdf\"\n",
    "# pdf_file = \"foo.pdf\"\n",
    "# pdf_file, kwargs = \"saturation_threshold.pdf\", {\"process_color_background\": False, \"process_background\": True, \"saturation_threshold\": 5, \"threshold_blocksize\": 25}  # \"process_background\": True,\n",
    "\n",
    "# pdf_file = \"birdisland.pdf\"\n",
    "# pdf_file, kwargs = \"diesel_engines.pdf\", {\"pages\": \"4-5\"} # containing multiple pages 2-4 = hybrid error same for 3-4,2-3\n",
    "\n",
    "# pdf_file, kwargs = \"column_span_1.pdf\", {\"copy_text\": \"h\"}\n",
    "# pdf_file = \"tabula/12s0324.pdf\"  # interesting because contains two separate tables\n",
    "# pdf_file, kwargs = \"tabula/12s0324.pdf\", {\"strip_text\": \" ,\\n\"}   # interesting because contains two separate tables\n",
    "# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_regions\": [\"320,335,573,505\"]} # test_network_table_regions\n",
    "# pdf_file, kwargs = \"tabula/us-007.pdf\", {\"table_areas\": [\"320,500,573,335\"]} # test_network_table_areas\n",
    "# pdf_file, kwargs = \"detect_vertical_false.pdf\", {\"strip_text\": \" ,\\n\"}  # data_stream_strip_text\n",
    "# pdf_file = \"detect_vertical_false.pdf\" #\n",
    "# pdf_file, kwargs, data = \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, }, data_stream_split_text  # data_stream_split_text\n",
    "# pdf_file, kwargs= \"tabula/m27.pdf\", {\"columns\": [\"72,95,209,327,442,529,566,606,683\"], \"split_text\": True, } # , data_stream_split_text  # data_stream_split_text\n",
    "\n",
    "# pdf_file = \"clockwise_table_2.pdf\"  # test_network_table_rotated / test_stream_table_rotated\n",
    "# pdf_file, kwargs = \"clockwise_table_2.pdf\", {\"edge_tol\": 10}  # configurable vgap header search not working\n",
    "# edge_tol 0 gives an error\n",
    "pdf_file = \"vertical_header.pdf\"\n",
    "\n",
    "# pdf_file = \"twotables_2.pdf\"\n",
    "# pdf_file = \"camelot-issue-132-multiple-tables.pdf\"\n",
    "# pdf_file = \"multiple_tables.pdf\" # fixes issue 132\n",
    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {\"edge_tol\": 500}, data_stream_edge_tol\n",
    "# pdf_file, kwargs = \"edge_tol.pdf\", {\"edge_tol\": 500} # , data_stream_edge_tol\n",
    "# pdf_file, kwargs, data = \"edge_tol.pdf\", {}, data_stream_edge_tol\n",
    "\n",
    "# pdf_file = \"tabula/arabic.pdf\"\n",
    "# pdf_file = \"tabula/indictb1h_14.pdf\" # interesting mixed type table\n",
    "# pdf_file = \"tabula/m27.pdf\" # one table spanning multiple pages\n",
    "# pdf_file = \"tabula/mednine.pdf\" # one table spanning multiple pages\n",
    "\n",
    "# pdf_file = \"tabula/spreadsheet_no_bounding_frame.pdf\n",
    "# pdf_file, kwargs = \"diesel_engines.pdf\", {\"pages\": \"4-5\"} # containing multiple pages\n",
    "\n",
    "# pdf_file, kwargs = \"tabula/schools.pdf\", {\"pages\": \"all\"}  # network parser hangs on contour plot\n",
    "\n",
    "filename = os.path.join(\n",
    "    os.path.dirname(os.path.abspath(\".\")), \"src/tests/files\", pdf_file\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "FLAVORS = [\"stream\", \"lattice\", \"network\", \"hybrid\"]\n",
    "tables_parsed = {}\n",
    "parses = {}\n",
    "max_tables = 0\n",
    "for idx, flavor in enumerate(FLAVORS):\n",
    "    timer_before_parse = time.perf_counter()\n",
    "    error, tables = None, []\n",
    "    try:\n",
    "        tables = camelot.read_pdf(filename, flavor=flavor, debug=True, **kwargs)\n",
    "    except ValueError as value_error:\n",
    "        error = f\"Invalid argument for parser {flavor}: {value_error}\"\n",
    "        print(error)\n",
    "    timer_after_parse = time.perf_counter()\n",
    "    max_tables = max(max_tables, len(tables))\n",
    "\n",
    "    parses[flavor] = {\n",
    "        \"tables\": tables,\n",
    "        \"time\": timer_after_parse - timer_before_parse,\n",
    "        \"error\": error,\n",
    "    }\n",
    "\n",
    "    print(f\"##### {flavor} ####\")\n",
    "    print(f\"Found {len(tables)} table(s):\")\n",
    "    for idx, table in enumerate(tables):\n",
    "        flavors_matching = []\n",
    "        for previous_flavor, previous_tables in tables_parsed.items():\n",
    "            for prev_idx, previous_table in enumerate(previous_tables):\n",
    "                if previous_table.df.equals(table.df):\n",
    "                    flavors_matching.append(f\"{previous_flavor} table {prev_idx}\")\n",
    "        print(f\"## Table {idx} ##\")\n",
    "        if flavors_matching:\n",
    "            print(f\"Same as {', '.join(flavors_matching)}.\")\n",
    "        else:\n",
    "            display(table.df)\n",
    "            print(\"\")\n",
    "    tables_parsed[flavor] = tables"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Show tables layout within original document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Set up plotting options\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# %matplotlib inline\n",
    "PLOT_HEIGHT = 12\n",
    "\n",
    "row_count = max(max_tables, 1)\n",
    "plt.rcParams[\"figure.figsize\"] = [PLOT_HEIGHT * len(FLAVORS), PLOT_HEIGHT * row_count]\n",
    "fig, axes = plt.subplots(row_count, len(FLAVORS))\n",
    "plt.subplots_adjust(wspace=0, hspace=0)  # Reduce margins to maximize the display zone\n",
    "\n",
    "fig.suptitle(\"Side-by-side flavor comparison\", fontsize=24, fontweight=\"bold\")\n",
    "for idx, flavor in enumerate(FLAVORS):\n",
    "    parse = parses[flavor]\n",
    "    tables = parse[\"tables\"]\n",
    "    top_ax = axes.flat[idx]\n",
    "    title = f\"{flavor}\\n\" f\"Detected {len(tables)} table(s) in {parse['time']:.2f}s\"\n",
    "    if parse[\"error\"]:\n",
    "        title = title + f\"\\nError parsing: {parse['error']}\"\n",
    "    top_ax.set_title(title, fontsize=12, fontweight=\"bold\")\n",
    "    for table_idx, table in enumerate(tables):\n",
    "        if max_tables > 1:\n",
    "            ax = axes[table_idx][idx]\n",
    "        else:\n",
    "            ax = axes[idx]\n",
    "        # Check if the table has data before attempting to plot it\n",
    "        if (\n",
    "            table.shape[0] > 0 and table.shape[1] > 0\n",
    "        ):  # Check if table has rows and columns\n",
    "            fig = camelot.plot(table, kind=\"grid\", ax=ax)\n",
    "            ax.text(\n",
    "                0.5,\n",
    "                -0.1,\n",
    "                \"{flavor} table {table_idx} - {rows}x{cols}\".format(\n",
    "                    flavor=flavor,\n",
    "                    table_idx=table_idx,\n",
    "                    rows=table.shape[0],\n",
    "                    cols=table.shape[1],\n",
    "                ),\n",
    "                size=14,\n",
    "                ha=\"center\",\n",
    "                transform=ax.transAxes,\n",
    "            )\n",
    "        else:\n",
    "            print(\n",
    "                f\"Skipping plotting for empty table {table_idx} in {flavor}\"\n",
    "            )  # Inform user about the skipped table\n",
    "        timer_after_plot = time.perf_counter()"
   ]
  }
 ],
 "metadata": {
  "file_extension": ".py",
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.0"
  },
  "mimetype": "text/x-python",
  "name": "python",
  "npconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": 3
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
